#' ---
#' title: "DOL-ILAB SDC - Nepal Round 3_FinalClean_Labor"
#' author: "Pablo Diego-Rosell, PhD"
#' email: "pablo_diego_rosell@yahoo.com"
#' output:
#'    html_document:
#'      toc: true
#'      theme: united
#' ---

# Start from an empty workspace, including hidden (dot-prefixed) objects.
# The argument is spelled out as `all.names = TRUE`: the earlier `all = t`
# passed the base function `t()` instead of the logical TRUE and relied on
# partial argument matching.
rm(list = ls(all.names = TRUE))

#' # Setup and create dictionary

# Base name of the data file; the "_PU" output files at the end of this
# script are named after it.
filename <- "Nepal Round 3_FinalClean_Labor" # !!!Update filename

# Load the project helper functions used throughout this script.
# NOTE(review): `mydata` is never created in this script, so it is presumably
# loaded by this file based on `filename` -- confirm.
source("functions_1.7.R")

#' Visually inspect variables in "dictionary.csv" and flag for risk, using the following flags: 
# Direct PII: Respondent Names, Addresses, Identification Numbers, Phone Numbers
# Direct PII-team: Interviewer Names, other field team names 
# Indirect PII-ordinal: Date of birth, Age, income, education, household composition. 
# Indirect PII-categorical: Gender, education, ethnicity, nationality,
# occupation, employer, head of household, marital status
# GPS: Longitude, Latitude
# Location: Small Location (<100,000) Large Location (>100,000)
# Weight: weightVar
# Household ID:  hhId, 
# Open-ends: Review responses for any sensitive information, redact as necessary 

#!!!Save flagged dictionary in .xlsx format and continue processing data with subset of flagged variables

#' # Direct PII: variables to be removed

# !!!Include any Direct PII variables
# Columns listed here are dropped from the dataset outright.
dropvars <- c(
  "RvwName",
  "IDR3_18",
  "IDR3_19",
  "LE_reportedby",
  "flag_reportedby"
)
# Keep only the columns whose name has no match in `dropvars`.
mydata <- mydata[is.na(match(names(mydata), dropvars))]

#' # Direct PII-team: Encode interviewer names, which may be useful for analysis of interviewer effects
#' !!!Replace vector in "variables" field below with relevant variable names

# Encode the field-team name columns with the project helper.
# NOTE(review): the helper takes no data argument, so it presumably reads the
# global `mydata` and returns the modified copy -- confirm in functions_1.7.R.
mydata <- encode_direct_PII_team(
  variables = c("Srvyr", "surveyor")
)

#' # Small locations: Encode locations with pop <100,000 using random large numbers
#' !!!Include relevant variables, but check their population size first to confirm they are <100,000

# Location columns to be encoded.
locvars <- c(
  "vdc",
  "IDR3_6_19", "IDR3_6_22", "IDR3_6_23", "IDR3_6_24",
  "IDR3_6_26", "IDR3_6_30", "IDR3_6_31", "IDR3_6_35",
  "IDR3_7"
)
# 999999 is passed as the missing-value code for these columns.
mydata <- encode_location(variables = locvars, missing = 999999)

#' # Indirect PII - Ordinal: Global recode or Top/bottom coding for extreme values
# Focus on variables with a "Lowest Freq" of 10 or less.

# Cut points for the age bands and the value labels attached to the codes 1-7.
break_age <- c(15, 25, 35, 45, 55, 65, 100)
labels_age <- setNames(
  as.numeric(1:7),
  c("15-24", "25-34", "35-44", "45-54", "55-64", "65 and older", "NA")
)

# Both age variables are banded with the same breaks and labels.
mydata <- ordinal_recode(
  variable = "IDR3_20",
  break_points = break_age,
  missing = 999999,
  value_labels = labels_age
)
mydata <- ordinal_recode(
  variable = "age",
  break_points = break_age,
  missing = 999999,
  value_labels = labels_age
)

# !!!Include relevant variables in list below

# Candidate indirect-PII variables, in dictionary order; they are tabulated
# by capture_tables() below.
indirect_PII <- c(
  "D_9",
  "HC2_O1", "HC2_O2", "HC2_O3", "HC2_O4", "HC2_O5", "HC2_O6",
  "H2_12_TEXT",
  "HC3",
  "HC4_1", "HC4_2", "HC4_3", "HC4_4",
  "D_4",
  "Inc_17",
  # con1A / con1B variables, suffixes _I1 to _I3
  "con1A_gender_I1", "con1A_age_I1", "con1A_caste_I1",
  "con1A_crime_I1", "con1A_income_I1", "con1A_educ_I1",
  "con1B_gender_I1", "con1B_age_I1", "con1B_caste_I1",
  "con1B_crime_I1", "con1B_income_I1", "con1B_educ_I1",
  "T_233_1_I1",
  "con1A_gender_I2", "con1A_age_I2", "con1A_caste_I2",
  "con1A_income_I2", "con1A_educ_I2",
  "con1B_gender_I2", "con1B_age_I2", "con1B_caste_I2",
  "con1B_income_I2", "con1B_educ_I2",
  "con1A_gender_I3", "con1A_age_I3", "con1A_caste_I3",
  "con1A_income_I3", "con1A_educ_I3",
  "con1B_gender_I3", "con1B_age_I3", "con1B_caste_I3",
  "con1B_income_I3", "con1B_educ_I3",
  # con2A / con2B variables, suffixes _I1 to _I3
  "con2A_gender_I1", "con2A_age_I1", "con2A_caste_I1",
  "con2A_income_I1", "con2A_educ_I1",
  "con2B_gender_I1", "con2B_age_I1", "con2B_caste_I1",
  "con2B_income_I1", "con2B_educ_I1",
  "con2A_gender_I2", "con2A_age_I2", "con2A_caste_I2",
  "con2A_income_I2", "con2A_educ_I2",
  "con2B_gender_I2", "con2B_age_I2", "con2B_caste_I2",
  "con2B_income_I2", "con2B_educ_I2",
  "con2A_gender_I3", "con2A_age_I3", "con2A_caste_I3",
  "con2A_income_I3", "con2A_educ_I3",
  "con2B_gender_I3", "con2B_age_I3", "con2B_caste_I3",
  "con2B_income_I3", "con2B_educ_I3",
  # P-series variables
  "P1", "P1A", "P2", "P2A", "P3", "P3A", "P4", "P4A",
  "P8_O1", "P8_O2", "P8_O3",
  "P8_3_number", "P8_4_number", "P8_5_number",
  "P12A", "P12A_TEXT", "P13A_O1", "P13A_O2", "P13A_10_TEXT",
  "P9B", "P10B", "P12B", "P13B_O1", "P13B_O2", "P13B_10_TEXT",
  "P9C_I1", "P10C_I1", "P11C_I1", "P11_A3_I1",
  "P12C_I1", "P12C_TEXT_I1", "P13C_O1_I1", "P13C_10_TEXT_I1",
  "P9C_I2", "P10C_I2", "P11C_I2", "P11_A3_I2", "P12C_I2",
  "P9D_I1", "P10D_I1", "P11D_I1", "P11_A4_I1",
  "P12D_I1", "P13D_O1_I1", "P13D_10_TEXT_I1",
  "P9D_I2", "P10D_I2", "P11D_I2", "P11_A4_I2",
  "P12D_I2", "P13D_O1_I2", "P13D_O2_I2",
  "P9E_I1", "P10E_I1", "P11E_I1", "P11_A5_I1",
  "P12E_I1", "P13E_O1_I1", "P13E_O2_I1",
  "P9E_I2", "P10E_I2", "P11E_I2", "P11_A5_I2",
  "P12E_I2", "P13E_O1_I2", "P14E_O1_I2",
  "P9E_I3", "P10E_I3", "P11E_I3", "P11_A5_I3",
  "P12E_I3", "P13E_O1_I3", "P14E_O1_I3",
  "P20A", "P19B",
  "P18C_I1", "P19C_I1", "P20C_I1",
  "P18C_I2", "P19C_I2", "P20C_I2",
  "P18D_I1", "P19D_I1", "P20D_I1",
  "P18D_I2", "P19D_I2", "P20D_I2",
  "P18E_I1", "P19E_I1", "P20E_I1",
  # *_cl variables, suffixes _I1 to _I6
  "NEW_2",
  "NEW_2_cl_I1", "P19_cl_I1", "D_9_cl_I1", "D_4_cl_I1",
  "NEW_2_cl_I2", "P19_cl_I2", "D_9_cl_I2", "D_4_cl_I2",
  "NEW_2_cl_I3", "P19_cl_I3", "D_9_cl_I3", "D_4_cl_I3",
  "NEW_2_cl_I4", "P19_cl_I4", "D_9_cl_I4", "D_4_cl_I4",
  "NEW_2_cl_I5", "P19_cl_I5", "D_9_cl_I5", "D_4_cl_I5",
  "NEW_2_cl_I6", "P19_cl_I6", "D_4_cl_I6", "D_8_cl_I6",
  "E2_2",
  "child_int",
  # forced-marriage (FM_*) variables
  "forcedmarriage",
  "FM_self", "FM_spouse",
  "FM_child", "FM_childnum",
  "FM_parent", "FM_parentnum",
  "FM_sib", "FM_sibnum",
  "FM_self_aschild", "FM_spouse_aschild",
  "FM_child_aschild1", "FM_child_aschild2",
  "FM_parent_aschild1", "FM_parent_aschild2",
  "FM_sib_aschild1",
  # remaining variables, including the *_rc* series
  "income", "incq", "noeduc",
  "rchild_int1", "rchild_int2", "rchild_int3",
  "rchild_int4", "rchild_int5", "rchild_int6",
  "age_rc1", "age_rc2", "age_rc3", "age_rc4", "age_rc5", "age_rc6",
  "menace_rc1", "menace_rc2",
  "noeduc_rc",
  "allchild_int_hh", "rchild_int_hh",
  "agegroup_rc1", "agegroup_rc2", "agegroup_rc3",
  "fiveyears_rcnum",
  "incqb_rc1", "incqb_rc2", "incqb_rc3",
  "incqb_rc4", "incqb_rc5", "incqb_rc6",
  "incqb_rcnum",
  "hhnoeduc_rc1", "hhnoeduc_rc2", "hhnoeduc_rc3",
  "hhnoeduc_rc4", "hhnoeduc_rc5",
  "hhnoeduc_rcnum"
)

# Produce frequency tables for every listed variable via the project helper.
capture_tables(indirect_PII)

# Recode those with very specific values where more than half of the sample have actual data.

# Drop as actually verbatim data in Nepali
mydata <- mydata[!names(mydata) %in% "H2_12_TEXT"]

# P3 - Number of siblings, topcode cases with 10 or more than 10 siblings.
# NOTE(review): no recode of P3 follows this comment anywhere in the script.

# Encode E2_2 as it has low frequencies on languages. The result is written
# back to `mydata`; it used to go to an unused `mydata2`, so the encoding
# never reached the saved files.
mydata <- encode_direct_PII_team(variables = "E2_2")

# Topcode cases with 5 or more adult household members.
mydata <- top_recode("HC3", break_point = 5, missing = c(888, 999999))

# Top code high income to the 99.5 percentile.
# The missing code 999999 is excluded from the percentile, and na.rm = TRUE
# keeps quantile() from failing when the column contains NA.
percentile_99.5 <- floor(quantile(
  mydata$Inc_17[mydata$Inc_17 != 999999],
  probs = 0.995,
  na.rm = TRUE
))
mydata <- top_recode(
  variable = "Inc_17",
  break_point = percentile_99.5,
  missing = 999999
)

# `income` gets its own 99.5th percentile instead of reusing the Inc_17
# threshold, and the result is kept in `mydata` (it was discarded into
# `mydata2` before, leaving `income` un-topcoded in the output).
percentile_99.5 <- floor(quantile(
  mydata$income[mydata$income != 999999],
  probs = 0.995,
  na.rm = TRUE
))
mydata <- top_recode(
  variable = "income",
  break_point = percentile_99.5,
  missing = 999999
)

#' # Matching and crosstabulations: Run automated PII check

# Key variables for the sdcMicro object are chosen from the dictionary review.
# Reference: https://sdcpractice.readthedocs.io/en/latest/anon_methods.html
# Every name must match a column name in the data file.
# Categorical key variables selected: gender, occupation/education and age.
##!!! Replace with candidate categorical demo vars
selectedKeyVars <- c("D_4", "IDR3_20", "con1A_gender_I1")

# Build the sdcMicro object on those key variables and print its summary.
sdcInitial <- createSdcObj(
  dat = mydata,
  keyVars = selectedKeyVars
)
sdcInitial

# Recode of education and age to reduce risk of re-identification

# Cut points for the education bands and the value labels for codes 1-11.
break_edu <- c(0, 6, 9, 11, 12, 13, 15, 16, 777, 888, 999)
labels_edu <- setNames(
  as.numeric(1:11),
  c(
    "Primary or less (0-5)",
    "Lower secondary (6-8)",
    "Secondary (9-10)",
    "SLC (11)",
    "CLASS 12/Intermediate level (12)",
    "Bachelor/Postgraduate level",
    "Literate, but never attended school",
    "Illiterate, and never attended school",
    "Does not apply",
    "Don't Know",
    "NA"
  )
)

# Apply the same banding to each education variable, in the original order.
for (edu_var in c("HC4_1", "HC4_2", "HC4_3", "HC4_4", "D_4")) {
  mydata <- ordinal_recode(
    variable = edu_var,
    break_points = break_edu,
    missing = 999999,
    value_labels = labels_edu
  )
}

# Re-run to check 2-anonymity

# Rebuild the sdcMicro object on the recoded data and print its summary.
sdcInitial <- createSdcObj(
  dat = mydata,
  keyVars = selectedKeyVars
)
sdcInitial

#' Show values of key variable of records that violate k-anonymity
#mydata <- labelDataset(mydata)

# Flag records whose value in column 2 of the individual-risk table is below 2.
# NOTE(review): column 2 is presumably the sample frequency count of the key
# combination (fk) -- confirm against the sdcMicro documentation.
notAnon <- sdcInitial@risk$individual[, 2] < 2 # for 2-anonymity
mydata[notAnon, selectedKeyVars]

# Run sdcMicro's local suppression on the key variables.
sdcFinal <- localSuppression(sdcInitial)

# Recombining anonymized variables

# Print the suppressed key values for the flagged records.
extractManipData(sdcFinal)[notAnon, selectedKeyVars] # manipulated variables HH
# Overwrite D_4 for every flagged record with code 11, which is labelled "NA"
# in labels_edu.
mydata[notAnon, "D_4"] <- 11

#' # Open-ends: review responses for any sensitive information, redact as necessary
#'
# !!! Identify open-end variables here:
# Translated open-ended text columns passed to report_open() for review.
open_ends <- c(
  "H2_12_TEXT_Translation", "HTNx3_2_14_TEXT_Translation",
  "HTN_5x3_TEXT_Translation", "HTV_1_10_TEXTx3_Translation",
  "HTV_3_11_TEXTx3_Translation", "CPR5i_TEXT_Translation",
  "G1_00_08_TEXT_Translation", "P13A_10_TEXT_Translation",
  "P14A_12_TEXT_Translation", "SIMPOC7A_10_TEXT_Translation",
  "P13B_10_TEXT_Translation", "P14B_12_TEXT_Translation",
  "SIMPOC7B_10_TEXT_Translation", "P13C_10_TEXT_I1_Translation",
  "P14C_12_TEXT_I1_Translation", "SIMPOC7C_10_TEXT_I1_Translation",
  "P14C_12_TEXT_I2_Translation", "P13D_10_TEXT_I1_Translation",
  "P14D_12_TEXT_I1_Translation", "P14D_12_TEXT_I2_Translation",
  "P13E_10_TEXT_I1_Translation", "P14E_12_TEXT_I1_Translation",
  "SIMPOC7E_10_TEXT_I1_Translation", "P14E_12_TEXT_I2_Translation",
  "P14E_12_TEXT_I3_Translation", "NEW_3_12_TEXT_Translation",
  "NEW_9_TEXT_Translation", "SIMPOC7_cl_10_TEXT_I1_Translate",
  "SIMPOC7_cl_10_TEXT_I2_Translate", "NEW_10_TEXT_Translation",
  "P13_cl_O3_TEXT_I1_Translation", "NEW_9_cl_TEXT_I1_Translation",
  "NEW_9_cl_TEXT_I2_Translation", "NEW_9_cl_TEXT_I3_Translation",
  "P14_cl_O2_I1_TEXT_Translation", "P13_cl_O2_TEXT_I2_Translation",
  "SIMPOC7_cl_10_TEXT_I3_Translate", "P14_cl_O1_I3_TEXT_Translation",
  "P14_cl_O1_I2_TEXT_Translation", "e3e_TEXT_Translation",
  "E2_11_8_TEXT_Translation", "E_14_7_TEXT_Translation"
)

# Hand the list to the project helper that reports the open-ended responses.
report_open(list_open_ends = open_ends)

# Review "verbatims.csv". Identify variables to be deleted or redacted and their row number

# Manual redactions: each statement replaces one response, addressed by
# column name and row position, with a redacted version of the text.
mydata[["E_14_7_TEXT_Translation"]][1313] <-
  "Respondent's bother was tricked in bad activities and later threatened to help [activity redacted]"
mydata[["E_14_7_TEXT_Translation"]][1694] <-
  "In Q64, respondent said there was no income and later in Q307 respondent said [amount redacted] so entered the option more than 12,000 in Q307"
mydata[["E_14_7_TEXT_Translation"]][1907] <-
  "GPS did not capture for about 20 minutes and started the interview without GPS. In Q64 respondent did not have any income but her/his son sent [amount redacted] the other day"
mydata[["NEW_9_TEXT_Translation"]][1895] <-
  "Make [ocuppation redacted]"
mydata[["NEW_10_TEXT_Translation"]][1554] <-
  "Shop [type redacted]"

#mydata <- mydata[!names(mydata) %in% "SrvyrComment"]

#' # GPS data: Displace
# Setup map

# World map data restricted to the survey country.
#!!! Select correct country
countrymap <- filter(map_data("world"), region == "Nepal")
#admin <- raster::getData("GADM", country="NP", level=0) #!!! Select correct country map using standard 2-letter country codes: https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2
# Country boundary object read from a local GADM .rds file.
admin <- readRDS(file = "gadm36_NPL_0_sp.rds")

# Displace all pairs of GPS variables (Longitude, Latitude). Check summary statistics and maps before and after displacement.

# !!!Include relevant variables, always longitude first, latitude second.
# Each pair is displaced in turn; `gps.vars` holds the pair being processed.
# May take a few minutes to process.
for (gps.vars in list(
  c("Longitude", "Latitude"),
  c("GPSinitial_LO", "GPSinitial_LA"),
  c("gps_CEa_LO", "gps_CEa_LA"),
  c("gpsenumimp_LO", "gpsenumimp_LA")
)) {
  mydata <- displace(
    gps.vars,
    admin = admin,
    samp_num = 1,
    other_num = 100000
  )
}

#' # Save processed data in stata and SPSS format
#' Adds "_PU" (Public Use) to the end of the name

# Write the processed dataset once as Stata (.dta) and once as SPSS (.sav).
haven::write_dta(data = mydata, path = sprintf("%s_PU.dta", filename))
haven::write_sav(data = mydata, path = sprintf("%s_PU.sav", filename))

